read data:

getwd()
## [1] "E:/2021\u79cb\u62db/BaiduNetdiskWorkspace/\u54e5\u5927qmss/DV/Group_T_digital-products"
setwd("E:/2021秋招/BaiduNetdiskWorkspace/哥大qmss/DV/Group_T_digital-products/data")
bar <- read.csv("bar.csv")
barreview <- read.csv("barreview.csv")
bus_attr <- read.csv("business_allatt.csv")
user <- read.csv("10%eliteuser.csv")

load library:

#library(tidyverse) 
library(stringr)
library(wordcloud) 
## Loading required package: RColorBrewer
library(tidytext) 
library(DT) 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(leaflet) 

Map Visualization for Vegas Bars

library(readr)
library(ggplot2)
library(ggthemes)
library(maps)
library(dplyr)
library("DT")
library(stringr)
library("leaflet")
library(RColorBrewer)
library("data.table")
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
register_google(key = "AIzaSyBlZS1dMHmz5qAZww5VQRBBSlJa3VFfyVg", write = TRUE)
## Replacing old key (AIzaSyBlZS1dMHmz5qAZww5VQRBBSlJa3VFfyVg) with new key in C:\Users\lenove\Documents/.Renviron

Las Vegas Map

map_lv <- get_map("Las Vegas",
                            zoom = 12,
                            source = "stamen",
                            maptype = "toner-background")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=Las%20Vegas&zoom=12&size=640x640&scale=2&maptype=terrain&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Las+Vegas&key=xxx
## Source : http://tile.stamen.com/toner-background/12/736/1604.png
## Source : http://tile.stamen.com/toner-background/12/737/1604.png
## Source : http://tile.stamen.com/toner-background/12/738/1604.png
## Source : http://tile.stamen.com/toner-background/12/739/1604.png
## Source : http://tile.stamen.com/toner-background/12/736/1605.png
## Source : http://tile.stamen.com/toner-background/12/737/1605.png
## Source : http://tile.stamen.com/toner-background/12/738/1605.png
## Source : http://tile.stamen.com/toner-background/12/739/1605.png
## Source : http://tile.stamen.com/toner-background/12/736/1606.png
## Source : http://tile.stamen.com/toner-background/12/737/1606.png
## Source : http://tile.stamen.com/toner-background/12/738/1606.png
## Source : http://tile.stamen.com/toner-background/12/739/1606.png
## Source : http://tile.stamen.com/toner-background/12/736/1607.png
## Source : http://tile.stamen.com/toner-background/12/737/1607.png
## Source : http://tile.stamen.com/toner-background/12/738/1607.png
## Source : http://tile.stamen.com/toner-background/12/739/1607.png
ggmap(map_lv)

map bar location

g_location <- ggmap(map_lv) + theme_map()
g_location + geom_point(data=bar, aes(x=longitude,y=latitude),
                    size=0.3, alpha=0.3, color="blue")

highlight the bar hot spot

g_density <- g_location + geom_density2d(aes(x=longitude,y=latitude), 
  data=bar, color="green", size=1, bins=12) +
  stat_density2d(aes(x=longitude,y=latitude,
    fill = ..level.., alpha = ..level..),
    data=bar, geom = 'polygon', bins=12) +
  scale_fill_gradient2(low = "green", mid="yellow", high = "red") +
  scale_alpha(range = c(0.00, 0.5)) 
g_density

# Visualize the neighborhood each bar belongs to

#add legend of stars
lvbar_map_neighborhood <- 
  leaflet(bar) %>%
  addTiles() %>%    # Add OpenStreetMap map tiles
  addCircles(lng = ~longitude, lat = ~latitude)

pal = colorFactor("Set1", domain = bar$neighborhood) # Grab a palette
color_neighborhood = pal(bar$neighborhood)

lvbar_map_neighborhood %>% addCircles(color=color_neighborhood) %>%
  addLegend(pal = pal, values = ~bar$neighborhood, title = "Neighborhood")
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
content <- paste("Name:",bar$name,"<br/>",
                 "Address:",bar$address,"<br/>",
                 "Stars:",bar$stars,"<br/>",
                 "Neighborhood:", bar$neighborhood,"<br/>")

pal = colorFactor("YlOrRd", domain = bar$stars) # Grab a palette
color_stars = pal(bar$stars)

lvbar_map_neighborhood %>% addCircles(color=color_stars, popup = content) %>%
  addLegend(pal = pal, values = ~bar$stars, title = "Stars")
## Assuming "longitude" and "latitude" are longitude and latitude, respectively

Yelp bar review- text analysis:

Most Popular Categories regarding bars:

fillColor = "#FFA07A"
fillColor2 = "#F1C40F"

categories = str_split(bar$categories,";")
categories = as.data.frame(unlist(categories))
colnames(categories) = c("Name")

categories %>%
  group_by(Name) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() %>%
  mutate(Name = reorder(Name,Count)) %>%
  head(10) %>%
  
  
  ggplot(aes(x = Name,y = Count)) +
  geom_bar(stat='identity',colour="white", fill =fillColor2) +
  geom_text(aes(x = Name, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black',
            fontface = 'bold') +
  labs(x = 'Name of Category', y = 'Count', 
       title = 'Top 10 Categories regarding bars') +
  coord_flip() + 
  theme_bw()

Bars with most number of five Star Reviews:

stars_5 <- barreview %>%
  filter(stars ==5) %>%
  group_by(business_id) %>%
  select(business_id,stars,text) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() 


five = merge(stars_5, bar, by= "business_id")
five2 <- five %>%
  filter(stars ==5) %>%
  filter(is_open==1)

fivestar <- five2 %>%
  arrange(stars) %>%
  head(10)
fillColor2 = "#F1C40F"

fivestar %>%
  mutate(name = reorder(name,Count)) %>%
  ggplot(aes(x = name,y = Count)) +
  geom_bar(stat='identity',colour="white", fill = fillColor2) +
  geom_text(aes(x = name, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 2, colour = 'black',
            fontface = 'bold') +
  labs(x = 'Name of the Bars', 
       y = 'Count', 
       title = 'Name of the bars and Count') +
  coord_flip() +
  theme_bw()

Most 5 starred bar - J Karaoke Bar:

J_karaoke = bar %>% filter(business_id == "3pSUr_cdrphurO6m1HMP9A") %>%
  select(name,neighborhood,city,state,postal_code,categories)

datatable(head(J_karaoke), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))

A wordcloud to see the common words of reviews on “J Karaoke Bar”

createWordCloud = function(train)
{
  train %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%
  count(word,sort = TRUE) %>%
  ungroup()  %>%
  head(30) %>%
  
  with(wordcloud(word, n, max.words = 30,colors=brewer.pal(8, "Dark2")))
}

createWordCloud(barreview %>%
  filter(business_id == "3pSUr_cdrphurO6m1HMP9A"))

From the wordcloud, we can derive the ingisht that people praise the atmosphere, music, cleaning environment, services and food(especially chicken) in this bar, and indicates that they spend happy and comfortable time in this J Karaoke bar.

Similarly, let’s visualize the bars with most number of one star reviews:

#library()
stars_1 <- barreview %>%
  filter(stars ==1|stars==1.5) %>%
  group_by(business_id) %>%
  select(business_id,stars,text) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() 


one = merge(stars_1, bar, by= "business_id")
one2 <- one %>%
  filter(stars ==1|stars==1.5) %>%
  filter(is_open==1)

onestar <- one2 %>%
  arrange(stars) %>%
  head(10)

fillColor2 = "#F1C40F"

onestar %>%
  mutate(name = reorder(name,Count)) %>%
  ggplot(aes(x = name,y = Count)) +
  geom_bar(stat='identity',colour="white", fill = fillColor2) +
  geom_text(aes(x = name, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 2, colour = 'black',
            fontface = 'bold') +
  labs(x = 'Name of the Bars', 
       y = 'Count', 
       title = 'Name of the bars and Count') +
  coord_flip() +
  theme_bw()

Surprisingly, the bar named “Triumph property management” only has one star rating, and there are 12 reviews on that bar.

So we are interested to see the common words of reviews on “Triumph property management”:

createWordCloud = function(train)
{
  train %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%
  count(word,sort = TRUE) %>%
  ungroup()  %>%
  head(30) %>%
  
  with(wordcloud(word, n, max.words = 30,colors=brewer.pal(8, "Dark2")))
}

createWordCloud(barreview %>%
  filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ"))

People in their reviews complain about the house/environment of the bar.

Let’s create a datatable to see some information regarding “Triumph property management”:

Triumph = bar %>% filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ") %>%
  select(name,neighborhood,city,state,postal_code,categories)

datatable(head(Triumph), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))

define 5-star and 1-star bar datasets with reviews

goodbar <- barreview %>%
  filter(stars == 5) %>%
  group_by(business_id) %>%
  ungroup()

star_five <- merge(goodbar,bar,by = "business_id")

badbar <- barreview %>%
  filter(stars == 1) %>%
  group_by(business_id) %>%
  ungroup()

star_one <- merge(badbar,bar,by = "business_id")

preprocessing reviews

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
df_five = data.frame(doc_id = star_five$business_id, text = star_five$text,stringsAsFactors = F)
star_five2 <- DataframeSource(df_five)
star_five2 <- VCorpus(star_five2)

df_one = data.frame(doc_id = star_one$business_id, text = star_one$text,stringsAsFactors = F)
star_one2 <- DataframeSource(df_one)
star_one2 <- VCorpus(star_one2)
#Remove unnecessary words(stop words), synatx, punctuation, numbers, white space etc.
library(stringr)
remove_nonalphanum <- function(x){str_replace_all(x, "[^[:alnum:]]", " ")}
remove_brandnames <- function(x){str_replace_all(x, "\\b[A-Z]+\\b", " ")}

clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, content_transformer(remove_nonalphanum))
  corpus <- tm_map(corpus, content_transformer(remove_brandnames))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}
#cleaning two datasets
star_five_clean <- clean_corpus(star_five2)
star_one_clean <- clean_corpus(star_one2)
#create a document-term-matrix:
library(tm)
#create the dtm from the corpus
corpus_five_dtm <- DocumentTermMatrix(star_five_clean)
corpus_one_dtm <- DocumentTermMatrix(star_one_clean)
#provide a word cloud of the most frequent words for "five_star" bars and "one_star" bars
library(tidytext)
corpus_five_dt <- tidy(corpus_five_dtm)
corpus_one_dt <- tidy(corpus_one_dtm) 
head(corpus_five_dt)
## # A tibble: 6 x 3
##   document               term      count
##   <chr>                  <chr>     <dbl>
## 1 -1m9o3vGRA8IBPNvNqKLmA adjust        1
## 2 -1m9o3vGRA8IBPNvNqKLmA amazing       1
## 3 -1m9o3vGRA8IBPNvNqKLmA awesome       1
## 4 -1m9o3vGRA8IBPNvNqKLmA beautiful     1
## 5 -1m9o3vGRA8IBPNvNqKLmA best          1
## 6 -1m9o3vGRA8IBPNvNqKLmA dark          1
#tf-idf
corpus_five_tdidf <- corpus_five_dt %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))

corpus_one_tdidf <- corpus_one_dt %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))

head(corpus_five_tdidf)
## # A tibble: 6 x 6
##   document               term          count     tf   idf tf_idf
##   <chr>                  <chr>         <dbl>  <dbl> <dbl>  <dbl>
## 1 nSQBL7E9JdN40Dcg-QiFJA tyrone            2 0.182   6.01  1.09 
## 2 fxHqXVqED2jYwCSgkKJTjA idiosyncratic     2 0.111   7.11  0.790
## 3 kpC__sWtWkLdSOI2xxdirg smithwicks        1 0.167   4.71  0.785
## 4 C3IwicBceqbeY1-JNuqd0g chillm            2 0.105   7.11  0.748
## 5 QgWPqUuDFm5wF5UpNECZYg phillip           2 0.133   4.91  0.655
## 6 ri4JWuvJQOOkoeWN5ZNC9A wildcats          1 0.0909  7.11  0.646
term_frequency_DT_five <- corpus_five_tdidf
term_frequency_DT_one <- corpus_one_tdidf

library(wordcloud)
#Set seed
set.seed(2103)

#create a wordcloud to show the frequent words of five stars bars
wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf,
          max.words=50, colors=brewer.pal(8, "Dark2"))

#create a wordcloud to show the frequent words of one stars bars
wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf,
          max.words=50, colors=brewer.pal(8, "Dark2"))

A pyramid plot to show how the words between five-stars and one-stars bars differ in word frequency:

#combine corpus of the most successful and unsuccessful projects
#select top 20 words
corpus_five_dt$bestworst <- "Top"
corpus_one_dt$bestworst <- "Bottom"
corpus_top_bottom_dt <- rbind(corpus_five_dt,corpus_one_dt)


corpus_top_bottom_count <- corpus_top_bottom_dt %>%
  group_by(term) %>%
  summarize(total_word = sum(count)) %>%
  arrange(desc(total_word)) %>%
  head(20)
pyramid = left_join(corpus_top_bottom_dt, corpus_top_bottom_count, by='term')

pyramid <- pyramid %>%
  filter(!is.na(total_word)) %>%
  group_by(bestworst) %>%
  mutate(count_plot = ifelse(bestworst == 'Bottom', count*(-1), count))
ggplot(pyramid, aes(x = reorder(term, total_word),
                  y = count_plot, fill = bestworst)) +
  geom_bar(data = filter(pyramid, bestworst == "Top"), stat = "identity") +
  geom_bar(data = filter(pyramid, bestworst == "Bottom"), stat = "identity") +
  scale_fill_brewer(palette = "Set1", direction=-1) +
  coord_flip() +
  scale_y_continuous(breaks = seq(-50,50,25)) +
  scale_fill_discrete(name = 'bars star rating', labels=c('one star', 'five star')) +
  ylab("") +
  ggthemes::theme_tufte() + 
  labs(
    x = 'Top 20 Words',
    y= 'Count',
    title = 'Pyramid Plot of Top 20 Words, for one star bars and five star bars'
  )
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

Sentiment analysis of reviews:

Positive v.s. negative words in the reivews of J Karaoke Bar

library(tidytext)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.0     v purrr   0.3.4
## v tidyr   1.2.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x NLP::annotate()       masks ggplot2::annotate()
## x data.table::between() masks dplyr::between()
## x dplyr::filter()       masks stats::filter()
## x data.table::first()   masks dplyr::first()
## x dplyr::lag()          masks stats::lag()
## x data.table::last()    masks dplyr::last()
## x purrr::map()          masks maps::map()
## x purrr::transpose()    masks data.table::transpose()
positiveWordsBarGraph <- function(SC) {
  contributions <- SC %>%
    unnest_tokens(word, text) %>%
    count(word,sort = TRUE) %>%
    ungroup() %>%
    
    inner_join(get_sentiments("afinn"), by = "word") %>%
    group_by(word) %>%
    summarize(occurences = n(),
              contribution = sum(value))

  
  contributions %>%
    top_n(20, abs(contribution)) %>%
    mutate(word = reorder(word, contribution)) %>%
    head(20) %>%
    ggplot(aes(word, contribution, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    coord_flip() + theme_bw()
}

positiveWordsBarGraph(barreview %>%
                       filter(business_id == "3pSUr_cdrphurO6m1HMP9A"))

calculate sentiment for “J Karaoke Bar”

J_Karaoke_reviews = star_five %>%
  filter(business_id == "3pSUr_cdrphurO6m1HMP9A")

calculate_sentiment <- function(review)
{
  sentiment_lines  =  review %>%
                  unnest_tokens(word, text) %>%
                  inner_join(get_sentiments("afinn"), by = "word") %>%
                  group_by(user_id) %>%
                  summarize(sentiment = mean(value),words = n()) %>%
                  ungroup() %>%
                  filter(words >= 10) 

  return(sentiment_lines)
  
}


sentiment_lines = calculate_sentiment(J_Karaoke_reviews)

head(sentiment_lines)
## # A tibble: 6 x 3
##   user_id                sentiment words
##   <chr>                      <dbl> <int>
## 1 2wxtnu-tA8i9HjHD55iU6g     1.91     11
## 2 3iocNPlPThAG2ZaNtUo4TQ     0.733    30
## 3 8dxkcmAXY4ttrVFD1GhbdQ     1.67     21
## 4 aF0BTeTVRXv4OHYXMNH7SQ     2.12     17
## 5 bRzr5YuEIncFzG6_vYSwcw     1.73     11
## 6 buSbz1HfaHoXP3QGw2XV_Q     1.42     12

Display top 10 most positive reviews for 5 star bars:

display_pos_sentiments <- function(sentiment_lines,review_text)
{
  pos_sentiment_lines = sentiment_lines %>%
  arrange(desc(sentiment))  %>%
  top_n(10, sentiment) %>%
  inner_join(review_text, by = "user_id") %>%
  select(date,sentiment,text) 
  
datatable(pos_sentiment_lines, style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))

}

display_pos_sentiments(sentiment_lines,J_Karaoke_reviews)

Positive v.s. negative words in the reivews of Triumph property management

positiveWordsBarGraph <- function(SC) {
  contributions <- SC %>%
    unnest_tokens(word, text) %>%
    count(word,sort = TRUE) %>%
    ungroup() %>%
    
    inner_join(get_sentiments("afinn"), by = "word") %>%
    group_by(word) %>%
    summarize(occurences = n(),
              contribution = sum(value))

  
  contributions %>%
    top_n(20, abs(contribution)) %>%
    mutate(word = reorder(word, contribution)) %>%
    head(20) %>%
    ggplot(aes(word, contribution, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    coord_flip() + theme_bw()
}

positiveWordsBarGraph(barreview %>%
                       filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ"))

Triumph_reviews = barreview %>%
  filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ")

calculate_sentiment <- function(review)
{
  sentiment_lines  =  review %>%
                  unnest_tokens(word, text) %>%
                  inner_join(get_sentiments("afinn"), by = "word") %>%
                  group_by(user_id) %>%
                  summarize(sentiment = mean(value),words = n()) %>%
                  ungroup() %>%
                  filter(words >= 10) 

  return(sentiment_lines)
  
}


sentiment_lines = calculate_sentiment(Triumph_reviews)

Display top 10 most negative reviews for Triumph property management:

display_neg_sentiments <- function(sentiment_lines,review_text)
{
  neg_sentiment_lines = sentiment_lines %>%
  arrange(desc(sentiment))  %>%
  top_n(-10, sentiment) %>%
  inner_join(review_text, by = "user_id") %>%
  select(date,sentiment,text) 
  
datatable(neg_sentiment_lines, style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
}

display_neg_sentiments(sentiment_lines,Triumph_reviews)

Visualize the geographical location of the top 10 five star bars(blue dot) and bottom 1 star or 1.5 star bars(yellow dot):

library(devtools)
## Loading required package: usethis
devtools::install_github("rstudio/leaflet")
## WARNING: Rtools is required to build R packages, but no version of Rtools compatible with R 4.0.4 was found. (Only the following incompatible version(s) of Rtools were found: 3.5)
## 
## Please download and install Rtools 4.0 from https://cran.r-project.org/bin/windows/Rtools/.
## Downloading GitHub repo rstudio/leaflet@HEAD
## utf8         (1.2.1  -> 1.2.2  ) [CRAN]
## cli          (3.1.1  -> 3.3.0  ) [CRAN]
## colorspace   (2.0-2  -> 2.0-3  ) [CRAN]
## vctrs        (0.3.8  -> 0.4.1  ) [CRAN]
## magrittr     (2.0.1  -> 2.0.3  ) [CRAN]
## fansi        (0.4.2  -> 1.0.3  ) [CRAN]
## RColorBrewer (1.1-2  -> 1.1-3  ) [CRAN]
## tibble       (3.1.0  -> 3.1.7  ) [CRAN]
## glue         (1.4.2  -> 1.6.2  ) [CRAN]
## digest       (0.6.27 -> 0.6.29 ) [CRAN]
## Rcpp         (1.0.8  -> 1.0.8.3) [CRAN]
## ggplot2      (3.3.5  -> 3.3.6  ) [CRAN]
## sp           (1.4-6  -> 1.4-7  ) [CRAN]
## xfun         (0.29   -> 0.30   ) [CRAN]
## yaml         (2.2.2  -> 2.3.5  ) [CRAN]
## jsonlite     (1.7.3  -> 1.8.0  ) [CRAN]
## Installing 16 packages: utf8, cli, colorspace, vctrs, magrittr, fansi, RColorBrewer, tibble, glue, digest, Rcpp, ggplot2, sp, xfun, yaml, jsonlite
## Installing packages into 'C:/Users/lenove/Documents/R/win-library/4.0'
## (as 'lib' is unspecified)
## 
##   There is a binary version available but the source version is later:
##     binary source needs_compilation
## cli  3.2.0  3.3.0              TRUE
## 
##   Binaries will be installed
## package 'utf8' successfully unpacked and MD5 sums checked
## package 'cli' successfully unpacked and MD5 sums checked
## package 'colorspace' successfully unpacked and MD5 sums checked
## package 'vctrs' successfully unpacked and MD5 sums checked
## package 'magrittr' successfully unpacked and MD5 sums checked
## package 'fansi' successfully unpacked and MD5 sums checked
## package 'glue' successfully unpacked and MD5 sums checked
## package 'digest' successfully unpacked and MD5 sums checked
## package 'Rcpp' successfully unpacked and MD5 sums checked
## package 'sp' successfully unpacked and MD5 sums checked
## package 'xfun' successfully unpacked and MD5 sums checked
## package 'yaml' successfully unpacked and MD5 sums checked
## package 'jsonlite' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\lenove\AppData\Local\Temp\RtmpW8Y3GO\downloaded_packages
## WARNING: Rtools is required to build R packages, but no version of Rtools compatible with R 4.0.4 was found. (Only the following incompatible version(s) of Rtools were found: 3.5)
## 
## Please download and install Rtools 4.0 from https://cran.r-project.org/bin/windows/Rtools/.
## * checking for file 'C:\Users\lenove\AppData\Local\Temp\RtmpW8Y3GO\remotes1ac8117e4863\rstudio-leaflet-0016c07/DESCRIPTION' ... OK
## * preparing 'leaflet':
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## Removed empty directory 'leaflet/docs'
## Removed empty directory 'leaflet/man-roxygen'
## * building 'leaflet_2.1.1.tar.gz'
## 
LasvegasCoords = bar %>% filter(city == "Las Vegas")
center_lon = median(LasvegasCoords$longitude,na.rm = TRUE)
center_lat = median(LasvegasCoords$latitude,na.rm = TRUE)

map <- leaflet(rbind(fivestar,onestar)) %>%
  addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~longitude, lat = ~latitude,radius = ~sqrt(review_count))  %>%
  addCircleMarkers(data=fivestar,col="blue",group="fivestar") %>%
  addCircleMarkers(data=onestar,color='yellow',group="onestar") %>%
    #Layers control
  addLayersControl(overlayGroups = c("fivestar","onestar"),
                   options = layersControlOptions(collapsed = FALSE)
                   ) %>%
  # controls
  setView(lng=center_lon, lat=center_lat,zoom = 13)
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
map

Top five stars bars are more centralized and concentrated; while the worst one star bars are relatively more sparse in their location, and are not close to transportation hub.

Social network analysis & regression models:

There are mainly three parts in this analysis. p1, p8 are network graphs, p2-p5 are descriptive analysis graphs, p6-p7 are correlation graphs.

library(readr)
library(networkD3)
## 
## Attaching package: 'networkD3'
## The following object is masked from 'package:leaflet':
## 
##     JS
## The following object is masked from 'package:DT':
## 
##     JS
library(visNetwork)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
## 
##     wind
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(sjPlot)
## Registered S3 method overwritten by 'parameters':
##   method                         from      
##   format.parameters_distribution datawizard

##Among the elite users who have reviewed any of bars in Vegas, what are their relationship? Is there some inside network between these celebrities in Yelp? Or do they just review and travel separately? To figure it out, I find those elite users who also has at least one elite user friend and construct a network between them. As a result, most of the links are individual and not connected to another links. There exists A-B-C chains, which means A is B’s friend, and B is C’s friend, but no classical social network “circle” or group. On the other word, A is not C’s friend. Thus, as Yelp is not a social media, but a review app, elite users don’t interact with each other in the app. It is more possible that every elite user is the center of his/her fans, not the center of his/her peers.

## Every user has reviewed one of the bars in Vegas

name <- user[, c("user_id", "friends")]

## 1. Network graph of top elite users(whose fans >= 100 and useful value >=  900)
net <- separate_rows(name, friends, sep = ",", convert = TRUE)
## No outside friends now
net <- net %>%
  filter(friends %in% user_id)
## Delete users with no inside followers

net <- rename(net, source = user_id)
net <- rename(net, target = friends)

netnode <- user %>%
  filter(user_id %in% net$source)
summary(netnode$average_stars)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.350   3.632   3.860   3.838   4.010   4.470
net <- as.data.frame(net)
p1 <- simpleNetwork(net, 
              nodeColour = "blue", 
              zoom=T,
              fontSize = 16)

##  There is no obvious network between the elite users
p1

Furthermore, it is obvious that the locaion doesn’t separate the bars in different groups. Perhaps Vegas is not a big place, so the distance doesn’t matter.

## 8. Append: Network of bars
rb <- barreview %>%
  group_by(business_id) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(50)
r50 <- barreview %>%
   filter(business_id %in% rb$business_id) %>%
   select(user_id, business_id)

## One bar have a branch in very close location also listed in top50, so actually the number of bars is 49

r50 <- left_join(r50, bar, by = "business_id")
r50 <- r50[, c("user_id", "name", "neighborhood", "review_count")]
r50$name<-gsub('["]',"", r50$name)
r50$name <- ifelse(r50$name == "Bachi Burger" & r50$neighborhood == "Southeast",  "Bachi Burger(SE)", r50$name)
r50 <- mutate(r50, barnum = as.factor(name))
barlink <- NULL
 for (i in 1 : 48) {
   for (j in i : 48) {
      a <- r50 %>% filter(as.numeric(barnum) == i)
      a <- a$user_id
      b <- r50 %>% filter(as.numeric(barnum) == j)
      b <- b$user_id
      c <- intersect(a, b)
      d <- length(c)
      tmp <- c(i, j, d)
      barlink <- rbind(barlink, tmp)
   }
}
barlink <- as.data.frame(barlink)
barlink <- barlink %>% 
  filter(V1 != V2) %>% 
  filter(V3 > 0)
barlink <- rename(barlink, source = V1, target = V2, value = V3)
barlink2 <- barlink %>% filter(value >= 50)
barlink2$value <- barlink2$value / 50
barlink2$source = barlink2$source - 1
barlink2$target = barlink2$target - 1  
barnode <- r50[, c("barnum", "neighborhood", "review_count")] %>% 
     distinct()
barnode$neighborhood <- as.factor(barnode$neighborhood)
barnode$rcsize <- barnode$review_count / 100 - 10
p8 <- forceNetwork(Links = barlink2, 
             Nodes = barnode, 
             Source = "source",
             Target = "target", 
             Value = "value", 
             NodeID = "barnum",
             Nodesize = "rcsize",
             Group = "neighborhood", 
             opacity = 0.6, zoom = TRUE)
p8

This picture tells you the distribution of bars in Vegas’ neighborhood. The strip is leading, and Eastside and Downtown are also great. However, if we take the average review scores into account, maybe Downtown is the best area for entertainment.

neighborhood <- bar %>%
  group_by(neighborhood) %>%
  summarize(avgstar = mean(stars), count = n())
neighborhood[1, 1] <- "Not known"
neighborhoodgraph <- ggplot(neighborhood, aes(x = count, y = reorder(neighborhood, count), fill = round(avgstar, 2))) + 
  geom_bar(stat="identity", width=1, color="white") +
  labs(x="review count", y="name of neighborhood")

## 3. Which neighborhood has most bars?
p3 <- ggplotly(neighborhoodgraph)
p3

This table tells you if we define the bars who have in average more than 4 stars and greater than 100 reviews are the top bars, how many of all the bars in Vegas are the top bars. The result is that only 240 bars are top, and the rest 1139 are normal. The gap of average review counts in the two types of bars is huge. Even in Vegas, good bars are rare. But their popularity and quality are undoubtful.

topstar <- mutate(bar, topstar = ifelse(stars >= 4 & review_count >= 100, "Top bars", "Non top bars"))
toppct <- topstar %>%
  group_by(topstar) %>%
  summarize(count = n(), avgreview = mean(review_count))

## 4. The gap between topbars and non top bars
p4 <- tab_df(toppct)
p4
topstar count avgreview
Non top bars 1108 108.22
Top bars 240 529.58

In the distribution of top bars, we find Spring Valley and China town are great areas. They are easy to be ignored if we just concentrate to the location crowded with bars.

topstar <- topstar %>%
  filter(topstar == "Top bars") %>%
  group_by(neighborhood) %>%
  summarize(count = n(), avgstar = mean(stars))
topstar[1, 1] <- "Not known"
topbargraph <- ggplot(topstar, aes(x = count, y = reorder(neighborhood, count), fill = round(avgstar, 2))) + 
  geom_bar(stat="identity", width=1, color="white") +
  labs(x="review count", y="name of neighborhood")

## 5. Which neighborhood has most top bars?
p5 <- ggplotly(topbargraph)
p5

Then let us dig deeper to discover the relationship among the attributes of elite users who has reviewed in Vegas’ bars. It seems that you will be more picky when you experience more places. But you may also be more conservative to criticize when your number of fans increases. The first linear regression graph shows this trend.

## 6. What influences the elite users' review scores most who has reviewed in Las Vegas.
lm1 <- lm(average_stars ~ review_count + fans + useful + compliment_hot, user)
summary(lm1)
## 
## Call:
## lm(formula = average_stars ~ review_count + fans + useful + compliment_hot, 
##     data = user)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87542 -0.16202  0.01053  0.17069  0.75058 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.896e+00  1.803e-02 216.130  < 2e-16 ***
## review_count   -8.430e-05  1.598e-05  -5.274 1.77e-07 ***
## fans            1.525e-04  4.488e-05   3.397 0.000719 ***
## useful          1.548e-07  6.045e-07   0.256 0.798000    
## compliment_hot -2.032e-06  5.988e-06  -0.339 0.734405    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2631 on 718 degrees of freedom
## Multiple R-squared:  0.04107,    Adjusted R-squared:  0.03573 
## F-statistic: 7.689 on 4 and 718 DF,  p-value: 4.549e-06
userstars <- ggplot(data = user, aes(x = fans, y = average_stars)) +
  stat_smooth(method = "lm", col = "blue") + 
  xlab("Number of fans") + ylab("Average stars")
p6 <- ggplotly(userstars)
## `geom_smooth()` using formula 'y ~ x'
p6